*****************************************************************************************

* This file imports and cleans the building permit data from the Census Building Permits Surveys

*****************************************************************************************

/* steps in below file

1. import and clean surveys separately by year ranges (based on when surveys have same format) 
2. append these files and clean them
3. harmonize ids across years and prepare them for merging to other files

*/


** set directory

* import data 
* CHUNK 1: 1980-1998

use "19801998permits.dta", clear

rename Date Survey_Date
rename Code StateCode
rename ID six_dig_it
rename v4 countyfips
rename CMSA MSA_CMSA_code
rename v6 PMSA_Code
rename v7 Region_Code
rename v8 Division_Code
rename v10 Redundant_MonthsRep
rename Bldgs Bldgs_1unit_census
rename Units Units_1unit_census
rename Value Value_1unit_census
rename v15 Bldgs_2unit_census
rename v16 Units_2unit_census
rename v17 Value_2unit_census
rename v18 Bldgs_3or4unit_census
rename v19 Units_3or4unit_census
rename v20 Value_3or4unit_census
rename v21 Bldgs_5unit_census
rename v22 Units_5unit_census
rename v23 Value_5unit_census
rename v24 Bldgs_1unit_reported
rename v25 Units_1unit_reported
rename v26 Value_1unit_reported
rename v27 Bldgs_2unit_reported
rename v28 Units_2unit_reported
rename v29 Value_2unit_reported
rename v30 Bldgs_3or4unit_reported
rename v31 Units_3or4unit_reported
rename v32 Value_3or4unit_reported
rename v33 Bldgs_5unit_reported
rename v34 Units_5unit_reported
rename v35 Value_5unit_reported

replace Bldgs_1unit_census = "FLAG" if v36 == 445000 | v36 == 0 
replace Bldgs_1unit_census = "FLAG" if v36 == 445000

gen v37 = "FLAG" if Bldgs_1unit_census == "FLAG"
tostring(v36), replace

replace Bldgs_1unit_census=Units_1unit_census if v37 == "FLAG"
replace Units_1unit_census=Value_1unit_census if v37 == "FLAG"
replace Value_1unit_census=Bldgs_2unit_census if v37 == "FLAG"
replace Bldgs_2unit_census=Units_2unit_census if v37 == "FLAG"
replace Units_2unit_census=Value_2unit_census if v37 == "FLAG"
replace Value_2unit_census=Bldgs_3or4unit_census if v37 == "FLAG"
replace Bldgs_3or4unit_census=Units_3or4unit_census if v37 == "FLAG"
replace Units_3or4unit_census=Value_3or4unit_census if v37 == "FLAG"
replace Value_3or4unit_census=Bldgs_5unit_census if v37 == "FLAG"
replace Bldgs_5unit_census=Units_5unit_census if v37 == "FLAG"
replace Units_5unit_census=Value_5unit_census if v37 == "FLAG"
replace Value_5unit_census=Bldgs_1unit_reported if v37 == "FLAG"
replace Bldgs_1unit_reported=Units_1unit_reported if v37 == "FLAG"
replace Units_1unit_reported=Value_1unit_reported if v37 == "FLAG"
replace Value_1unit_reported=Bldgs_2unit_reported if v37 == "FLAG"
replace Bldgs_2unit_reported=Units_2unit_reported if v37 == "FLAG"
replace Units_2unit_reported=Value_2unit_reported if v37 == "FLAG"
replace Value_2unit_reported=Bldgs_3or4unit_reported if v37 == "FLAG"
replace Bldgs_3or4unit_reported=Units_3or4unit_reported if v37 == "FLAG"
replace Units_3or4unit_reported=Value_3or4unit_reported if v37 == "FLAG"
replace Value_3or4unit_reported=Bldgs_5unit_reported if v37 == "FLAG"
replace Bldgs_5unit_reported=Units_5unit_reported if v37 == "FLAG"
replace Units_5unit_reported=Value_5unit_reported if v37 == "FLAG"
replace Value_5unit_reported = v36 if v37 == "FLAG"
drop v36
drop v37

duplicates drop *, force
save "19801998permits_cleaned.dta", replace

* CHUNK 2: 1999-2002

clear

use "1999-2002permits.dta", clear

rename Date Survey_Date
rename Code StateCode
rename ID six_dig_it
rename CMSA MSA_CMSA_code
rename v4 countyfips
rename v7 PMSA_Code
rename v9 zipcode
rename v10 Region_Code
rename v11 Division_Code
rename v13 Redundant_MonthsRep
rename Bldgs Bldgs_1unit_census
rename Units Units_1unit_census
rename Value Value_1unit_census
rename v18 Bldgs_2unit_census
rename v19 Units_2unit_census
rename v20 Value_2unit_census
rename v21 Bldgs_3or4unit_census
rename v22 Units_3or4unit_census
rename v23 Value_3or4unit_census
rename v24 Bldgs_5unit_census
rename v25 Units_5unit_census
rename v26 Value_5unit_census
rename v27 Bldgs_1unit_reported
rename v28 Units_1unit_reported
rename v29 Value_1unit_reported
rename v30 Bldgs_2unit_reported
rename v31 Units_2unit_reported
rename v32 Value_2unit_reported
rename v33 Bldgs_3or4unit_reported
rename v34 Units_3or4unit_reported
rename v35 Value_3or4unit_reported
rename v36 Bldgs_5unit_reported
rename v37 Units_5unit_reported
rename v38 Value_5unit_reported

duplicates drop *, force
save "1999-2002permits_cleaned.dta", replace


* CHUNK 3: 2003

clear
use "2003permits.dta", clear

rename Date Survey_Date
rename Code StateCode
rename ID six_dig_it
rename CSA CSA_Code
rename v4 countyfips
rename v7 CBSA_Code
rename v9 zipcode
rename v10 Region_Code
rename v11 Division_Code
rename v13 Redundant_MonthsRep
rename Bldgs Bldgs_1unit_census
rename Units Units_1unit_census
rename Value Value_1unit_census
rename v18 Bldgs_2unit_census
rename v19 Units_2unit_census
rename v20 Value_2unit_census
rename v21 Bldgs_3or4unit_census
rename v22 Units_3or4unit_census
rename v23 Value_3or4unit_census
rename v24 Bldgs_5unit_census
rename v25 Units_5unit_census
rename v26 Value_5unit_census
rename v27 Bldgs_1unit_reported
rename v28 Units_1unit_reported
rename v29 Value_1unit_reported
rename v30 Bldgs_2unit_reported
rename v31 Units_2unit_reported
rename v32 Value_2unit_reported
rename v33 Bldgs_3or4unit_reported
rename v34 Units_3or4unit_reported
rename v35 Value_3or4unit_reported
rename v36 Bldgs_5unit_reported
rename v37 Units_5unit_reported
rename v38 Value_5unit_reported
duplicates drop *, force
save "2003permits_cleaned.dta", replace


* CHUNK 4: 2004-2006

**
clear

use "2004-2006permits.dta", clear

rename Date Survey_Date
rename Code StateCode
rename ID six_dig_it
rename v4 countyfips
rename v6 CSA_Code
rename v7 CBSA_Code
rename v8 footnote
rename v10 zipcode
rename v11 Region_Code
rename v12 Division_Code
rename Bldgs Bldgs_1unit_census
rename Units Units_1unit_census
rename Value Value_1unit_census
rename v18 Bldgs_2unit_census
rename v19 Units_2unit_census
rename v20 Value_2unit_census
rename v21 Bldgs_3or4unit_census
rename v22 Units_3or4unit_census
rename v23 Value_3or4unit_census
rename v24 Bldgs_5unit_census
rename v25 Units_5unit_census
rename v26 Value_5unit_census
rename v27 Bldgs_1unit_reported
rename v28 Units_1unit_reported
rename v29 Value_1unit_reported
rename v30 Bldgs_2unit_reported
rename v31 Units_2unit_reported
rename v32 Value_2unit_reported
rename v33 Bldgs_3or4unit_reported
rename v34 Units_3or4unit_reported
rename v35 Value_3or4unit_reported
rename v36 Bldgs_5unit_reported
rename v37 Units_5unit_reported
rename v38 Value_5unit_reported
duplicates drop *, force
save "2004-2006permits_cleaned.dta", replace

* CHUNK 5: 2007-2018
clear

use "2007-2018permits.dta", replace

rename Date Survey_Date
rename Code StateCode
rename ID six_dig_it
rename v4 countyfips
rename v5 census_pl_code
rename v6 a_placefips
rename v7 b_placefips
rename v8 pop
rename v9 CSA_Code
rename v10 CBSA_Code
rename v11 footnote
rename v13 zipcode
rename v14 Region_Code
rename v15 Division_Code
rename Bldgs Bldgs_1unit_census
rename Units Units_1unit_census
rename Value Value_1unit_census
rename v21 Bldgs_2unit_census
rename v22 Units_2unit_census
rename v23 Value_2unit_census
rename v24 Bldgs_3or4unit_census
rename v25 Units_3or4unit_census
rename v26 Value_3or4unit_census
rename v27 Bldgs_5unit_census
rename v28 Units_5unit_census
rename v29 Value_5unit_census
rename v30 Bldgs_1unit_reported
rename v31 Units_1unit_reported
rename v32 Value_1unit_reported
rename v33 Bldgs_2unit_reported
rename v34 Units_2unit_reported
rename v35 Value_2unit_reported
rename v36 Bldgs_3or4unit_reported
rename v37 Units_3or4unit_reported
rename v38 Value_3or4unit_reported
rename v39 Bldgs_5unit_reported
rename v40 Units_5unit_reported
rename v41 Value_5unit_reported
duplicates drop *, force

* need a small fix to place fips for this year to match others
gen placefips = a_placefips
replace placefips = b_placefips if StateCode == "09" | StateCode == "23" | StateCode == "44" | StateCode == "50" | StateCode == "25" | StateCode == "33" | StateCode == "34" | StateCode == "42" | StateCode == "36"

**
save "2007-2018_permits_master_cleaned.dta", replace


* with files imported, we can now append them all together
clear
use "19801998permits_cleaned.dta"
append using  "1999-2002permits_cleaned.dta", gen(M1)
append using  "2003permits_cleaned.dta", gen(M2)
append using  "2004-2006permits_cleaned.dta", gen(M3)
append using  "2007-2018_permits_master_cleaned.dta", gen(M4)
save "1980-2018_permits_master.dta", replace
drop M1
drop M2
drop M3
drop M4

*Use the response variable to calculate which variables were imputed versus directly listed
gen response = 0
replace response = 1 if Bldgs_1unit_census == Bldgs_1unit_reported

* some text in top row of initial spreadsheet was imported in error.  we drop these.
keep if Survey_Date == "8099" |Survey_Date == "8199" |Survey_Date == "8299"|Survey_Date == "8399"|Survey_Date == "8499"|Survey_Date == "8599"|Survey_Date == "8699"|Survey_Date == "8799"|Survey_Date == "8899"|Survey_Date == "8999"|Survey_Date == "9099"|Survey_Date == "9199"|Survey_Date == "9299"|Survey_Date == "9399"||Survey_Date == "9499"|Survey_Date == "9599"|Survey_Date == "9699" |Survey_Date == "9799"|Survey_Date == "9899"|Survey_Date == "1999"|Survey_Date == "2000"|Survey_Date == "2001"|Survey_Date == "2002"|Survey_Date == "2003"|Survey_Date == "2004"|Survey_Date == "2005" |Survey_Date == "2006"|Survey_Date == "2007"|Survey_Date == "2008"|Survey_Date == "2009"|Survey_Date == "2010"|Survey_Date == "2011"|Survey_Date == "2012"|Survey_Date == "2013"|Survey_Date == "2014"|Survey_Date == "2015"|Survey_Date == "2016"|Survey_Date == "2017"|Survey_Date == "2018"

* generate a year variable from survey data
gen year = "", after(Survey_Date)
replace year = "19" + substr(Survey_Date, 1, 2) if ///
	substr(Survey_Date, 3, 2) == "99" & substr(Survey_Date, 1, 2) != "19" 
replace year = Survey_Date if year == ""
destring year, replace

* cleaning up names 
rename Name OriginalName
gen name = OriginalName
replace name = strproper(name)
replace name = subinstr(name, " .", " ", .)
replace name = strtrim(name)
replace name = substr(name, 1, strlen(name)-1) if substr(name, -1, 1) == "."
replace name = subinstr(name, " (N)", " ", .)	
replace name = subinstr(name, " #", " ", .)	
replace name = subinstr(name, "@4", " ", .)	
replace name = subinstr(name, "@5", " ", .)
replace name = subinstr(name, "@3", " ", .)
replace name = subinstr(name, "@2", " ", .)
replace name = subinstr(name, "@1", " ", .)
replace name = subinstr(name, "...", " ", .)
replace name = subinstr(name, "..........", " ", .)
replace name = subinstr(name, "*", " ", .)
replace name = strtrim(name)

* make ids match other data sets
rename six_dig_it id
destring(StateCode), generate(statefips)
destring(countyfips), replace
destring(id), replace

drop StateCode MSA_ PMSA_Code Redundant_MonthsRep v5 CSA_ CBSA_ Survey_Date

foreach v of varlist year-name {
	local lab =  "`: var label `v''"
	local lab = "`lab' from Census Building Permits"
	di "`lab'"
	label var `v' "`lab'"
}


* drop unincorporated and strange places
gen uniflag = 0

gen drop_name = lower(name)
replace uniflag= 1 if strpos(drop_name, "uninc") > 0
replace uniflag= 1 if strpos(drop_name, "bal. of co.") > 0
replace uniflag= 1 if strpos(drop_name, "balance of county") > 0

drop if uniflag == 1

drop uniflag


* make fips id that matches icma and nhgis ids
egen uniq = group(name statefip countyfip)

* destring some variables and create some logs as well as aggregations
destring(pop), replace 
destring(Units_1unit_census), replace 
destring(Units_2unit_census), replace 
destring(Units_3or4unit_census), replace 
destring(Units_5unit_census), replace 
destring(Bldgs_1unit_census Units_1unit_census Value_1unit_census Bldgs_2unit_census Units_2unit_census Value_2unit_census Bldgs_3or4unit_census Units_3or4unit_census Value_3or4unit_census Bldgs_5unit_census Units_5unit_census Value_5unit_census Bldgs_1unit_reported Units_1unit_reported Value_1unit_reported Bldgs_2unit_reported Units_2unit_reported Value_2unit_reported Bldgs_3or4unit_reported Units_3or4unit_reported Value_3or4unit_reported Bldgs_5unit_reported Units_5unit_reported Value_5unit_reported), replace
destring(placefips), replace

* clean up ids

* we can check if names and ids match up, or if we have multiple ids/names within a single name/id
egen tag = tag(statefips countyfips id name)
bysort statefips countyfips id (year): egen id_spans = total(tag)
bysort statefips countyfips name (year): egen name_spans = total(tag)
drop tag

// If name spans 1 and id spans 1, then nothing can be done to harmonize 
//	because there's no overlap to exploit
* (this means either that id and name are constant throughout whole sample, or that they both switch in one year)
gen id_final = id if id_spans == 1 & name_spans == 1 
gen name_final = name if id_spans == 1 & name_spans == 1

* if that's not the case, we can leverage names and ids that span breaks in ids/names
forvalues yr = 2018(-1)1980 {

	// Fills in the first set of IDs with the earliest Name, and the first set
	//	of names with the earlier ID... This ensures that regardless of which
	//	variables spans the first break, we've successfully made it across.
	bysort statefips countyfips id (year): replace name_final = name[_N] if year[_N] == `yr' & name_final == ""
	bysort statefips countyfips name (year): replace id_final = id[_N] if year[_N] == `yr' & id_final == .
	
	// Now we have either an id_final that spans a name break, or a name_final
	//	that spans an ID break. 
	bysort statefips countyfips id_final (year): replace name_final = name_final[_N] if name_final[_n] == "" & id_final != . & name_final[_N] != ""
	bysort statefips countyfips name_final (year): replace id_final = id_final[_N] if id_final[_n] == . & name_final != "" & id_final[_N] != .	
	
	local i = 1
	while `i' <= 3 {
	
		bysort statefips countyfips id (year): replace name_final = name_final[_N] if name_final[_N] != ""  & name_final[_n] == ""
		bysort statefips countyfips name (year): replace id_final = id_final[_N] if id_final[_N] != . & id_final[_n] == .

		bysort statefips countyfips id_final (year): replace name_final = name_final[_N] if name_final[_n] == "" & id_final != . & name_final[_N] != ""
		bysort statefips countyfips name_final (year): replace id_final = id_final[_N] if id_final[_n] == . & name_final != "" & id_final[_N] != .
	
	local i = `i' + 1
	}
}

* harmonize placefips and drop balances and unincorporated places*
* b_placefips is needed for one weird region that uses different codes
destring(placefips), replace
destring(b_placefips), replace
replace placefips = b_placefips if Region == "2" & year > 2006 & placefips ==.

bysort statefips countyfips id_final (year): replace name_final = name_final[_N]
bysort statefips countyfips name_final (year): replace id_final = id_final[_N]

gen placefips_final=placefips, after(countyfips)
bysort statefips countyfips id_final (year): replace placefips_final = placefips[_N]
order year statefips countyfips placefips_final id_final name_final id name, first
replace placefips = placefips_final if placefips == . & placefips_final != .

// Drop counties with Place fips of missing, zero, or 99990
drop if inlist(placefips_final, ., 0, 99990) & strpos(name_final, "County") > 0
drop if placefips_final == 99990 // This includes only whole Parishes or Boroughs

// Replace placefips_final to missing if zero for later dropping
replace placefips_final = . if placefips_final == 0
foreach v of varlist placefips_final-name_final {
	local lab =  "`: var label `v''"
	local lab = "`lab' Harmonized variable from Permits data"
	di "`lab'"
	label var `v' "`lab'"
}

drop if placefips_final == .
drop a_placefips b_placefips placefips id name Region_Code Division_Code MonthsRep OriginalName City zipcode footnote census_pl_code pop response id_spans name_spans

save "1980-2018_permits_master_WF_v2.dta", replace

* next, we make the data rectangular, filling in missing years
use "1980-2018_permits_master_WF_v2.dta", clear

* merge to id list
merge m:1 statefips countyfips id_final using "id_placefips.dta", gen(m2)
drop if m2 == 1
drop m2

* fill in missing years
egen id2 = group(statefips countyfips placefips_final)
order id2, after(year)
duplicates drop id2 year, force
tsset id2 year
fillin id2 year
sum placefips id2

order placefips, after(statefips)

* fill in missing values for the filled in years
drop if year == .
drop if id2 == .

foreach var of varlist statefips - id_final {
	bysort id2 (year): egen fill_`var' = mode(`var') 
	replace `var' = fill_`var' if fill_`var' != .
	drop fill_`var'
}

foreach var of varlist name_final {
	bysort id2 (year): egen fill_`var' = mode(`var') 
	replace `var' = fill_`var' if fill_`var' != ""
	drop fill_`var'
}


* finally, try to fill in some more missing placefips based on place names

* create municipality dummy
gen township = 1 if strpos(name_final, "Township") > 1
gen village = 1 if strpos(name_final, "Village") > 1
gen borough = 1 if strpos(name_final, "Borough") > 1
gen town = 1 if strpos(name_final, "Town") > 1

* generate name that matches crosswalk
gen place = name_final + " City"  if village == . & township == .& town == . & borough == . & name_final != ""
replace place = name_final if place == ""

* merge in crosswalk and update missings
merge m:1 statefips countyfips place using "placefips_2010", update keep(1 3 4 5) gen(m7)
merge m:1 statefips countyfips place using "placefips_2010_ICMA", update keep(1 3 4 5) gen(m8)

* drop missings and save
drop if placefips == .
drop _fillin
save "1980-2018_permits_master_WF_v4.dta", replace


